install library

#install.packages("dplyr")
#install.packages("ggplot2")
#install.packages('plotly')
#install.packages("lubridate")
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date

Read data

theUrl_train <- "/Users/chaoiting/Documents/BUAN 6356//Project/all/train.csv"
Train <- data.frame(read.csv(file = theUrl_train, header = TRUE, sep = ","),split='train')
theUrl_test <- "/Users/chaoiting/Documents/BUAN 6356//Project/all/test.csv"
Test <- data.frame(read.csv(file = theUrl_test, header = TRUE, sep = ","),split='test')
theUrl_store <- "/Users/chaoiting/Documents/BUAN 6356//Project/all/store.csv"
Store <- read.csv(file = theUrl_store, header = TRUE, sep = ",")

check data

summary(Train)
##      Store          DayOfWeek             Date             Sales      
##  Min.   :   1.0   Min.   :1.000   2013-01-02:   1115   Min.   :    0  
##  1st Qu.: 280.0   1st Qu.:2.000   2013-01-03:   1115   1st Qu.: 3727  
##  Median : 558.0   Median :4.000   2013-01-04:   1115   Median : 5744  
##  Mean   : 558.4   Mean   :3.998   2013-01-05:   1115   Mean   : 5774  
##  3rd Qu.: 838.0   3rd Qu.:6.000   2013-01-06:   1115   3rd Qu.: 7856  
##  Max.   :1115.0   Max.   :7.000   2013-01-07:   1115   Max.   :41551  
##                                   (Other)   :1010519                  
##    Customers           Open            Promo        StateHoliday
##  Min.   :   0.0   Min.   :0.0000   Min.   :0.0000   0:986159    
##  1st Qu.: 405.0   1st Qu.:1.0000   1st Qu.:0.0000   a: 20260    
##  Median : 609.0   Median :1.0000   Median :0.0000   b:  6690    
##  Mean   : 633.1   Mean   :0.8301   Mean   :0.3815   c:  4100    
##  3rd Qu.: 837.0   3rd Qu.:1.0000   3rd Qu.:1.0000               
##  Max.   :7388.0   Max.   :1.0000   Max.   :1.0000               
##                                                                 
##  SchoolHoliday      split        
##  Min.   :0.0000   train:1017209  
##  1st Qu.:0.0000                  
##  Median :0.0000                  
##  Mean   :0.1786                  
##  3rd Qu.:0.0000                  
##  Max.   :1.0000                  
## 
sum(is.na(Train$Store))
## [1] 0
summary(Store)
##      Store        StoreType Assortment CompetitionDistance
##  Min.   :   1.0   a:602     a:593      Min.   :   20.0    
##  1st Qu.: 279.5   b: 17     b:  9      1st Qu.:  717.5    
##  Median : 558.0   c:148     c:513      Median : 2325.0    
##  Mean   : 558.0   d:348                Mean   : 5404.9    
##  3rd Qu.: 836.5                        3rd Qu.: 6882.5    
##  Max.   :1115.0                        Max.   :75860.0    
##                                        NA's   :3          
##  CompetitionOpenSinceMonth CompetitionOpenSinceYear     Promo2      
##  Min.   : 1.000            Min.   :1900             Min.   :0.0000  
##  1st Qu.: 4.000            1st Qu.:2006             1st Qu.:0.0000  
##  Median : 8.000            Median :2010             Median :1.0000  
##  Mean   : 7.225            Mean   :2009             Mean   :0.5121  
##  3rd Qu.:10.000            3rd Qu.:2013             3rd Qu.:1.0000  
##  Max.   :12.000            Max.   :2015             Max.   :1.0000  
##  NA's   :354               NA's   :354                              
##  Promo2SinceWeek Promo2SinceYear          PromoInterval
##  Min.   : 1.0    Min.   :2009                    :544  
##  1st Qu.:13.0    1st Qu.:2011    Feb,May,Aug,Nov :130  
##  Median :22.0    Median :2012    Jan,Apr,Jul,Oct :335  
##  Mean   :23.6    Mean   :2012    Mar,Jun,Sept,Dec:106  
##  3rd Qu.:37.0    3rd Qu.:2013                          
##  Max.   :50.0    Max.   :2015                          
##  NA's   :544     NA's   :544
sum(is.na(Store$Store))
## [1] 0
#Check store id is unique
nrow(distinct(Store))==nrow(Store)
## [1] TRUE
#Join train data and store
Train_data <- left_join(Train, Store, by="Store")
summary(Train_data)
##      Store          DayOfWeek             Date             Sales      
##  Min.   :   1.0   Min.   :1.000   2013-01-02:   1115   Min.   :    0  
##  1st Qu.: 280.0   1st Qu.:2.000   2013-01-03:   1115   1st Qu.: 3727  
##  Median : 558.0   Median :4.000   2013-01-04:   1115   Median : 5744  
##  Mean   : 558.4   Mean   :3.998   2013-01-05:   1115   Mean   : 5774  
##  3rd Qu.: 838.0   3rd Qu.:6.000   2013-01-06:   1115   3rd Qu.: 7856  
##  Max.   :1115.0   Max.   :7.000   2013-01-07:   1115   Max.   :41551  
##                                   (Other)   :1010519                  
##    Customers           Open            Promo        StateHoliday
##  Min.   :   0.0   Min.   :0.0000   Min.   :0.0000   0:986159    
##  1st Qu.: 405.0   1st Qu.:1.0000   1st Qu.:0.0000   a: 20260    
##  Median : 609.0   Median :1.0000   Median :0.0000   b:  6690    
##  Mean   : 633.1   Mean   :0.8301   Mean   :0.3815   c:  4100    
##  3rd Qu.: 837.0   3rd Qu.:1.0000   3rd Qu.:1.0000               
##  Max.   :7388.0   Max.   :1.0000   Max.   :1.0000               
##                                                                 
##  SchoolHoliday      split         StoreType  Assortment
##  Min.   :0.0000   train:1017209   a:551627   a:537445  
##  1st Qu.:0.0000                   b: 15830   b:  8294  
##  Median :0.0000                   c:136840   c:471470  
##  Mean   :0.1786                   d:312912             
##  3rd Qu.:0.0000                                        
##  Max.   :1.0000                                        
##                                                        
##  CompetitionDistance CompetitionOpenSinceMonth CompetitionOpenSinceYear
##  Min.   :   20       Min.   : 1.0              Min.   :1900            
##  1st Qu.:  710       1st Qu.: 4.0              1st Qu.:2006            
##  Median : 2330       Median : 8.0              Median :2010            
##  Mean   : 5430       Mean   : 7.2              Mean   :2009            
##  3rd Qu.: 6890       3rd Qu.:10.0              3rd Qu.:2013            
##  Max.   :75860       Max.   :12.0              Max.   :2015            
##  NA's   :2642        NA's   :323348            NA's   :323348          
##      Promo2       Promo2SinceWeek  Promo2SinceYear 
##  Min.   :0.0000   Min.   : 1.0     Min.   :2009    
##  1st Qu.:0.0000   1st Qu.:13.0     1st Qu.:2011    
##  Median :1.0000   Median :22.0     Median :2012    
##  Mean   :0.5006   Mean   :23.3     Mean   :2012    
##  3rd Qu.:1.0000   3rd Qu.:37.0     3rd Qu.:2013    
##  Max.   :1.0000   Max.   :50.0     Max.   :2015    
##                   NA's   :508031   NA's   :508031  
##           PromoInterval   
##                  :508031  
##  Feb,May,Aug,Nov :118596  
##  Jan,Apr,Jul,Oct :293122  
##  Mar,Jun,Sept,Dec: 97460  
##                           
##                           
## 
summary(is.na(Train_data))
##    Store         DayOfWeek          Date           Sales        
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:1017209   FALSE:1017209   FALSE:1017209   FALSE:1017209  
##                                                                 
##  Customers          Open           Promo         StateHoliday   
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:1017209   FALSE:1017209   FALSE:1017209   FALSE:1017209  
##                                                                 
##  SchoolHoliday     split         StoreType       Assortment     
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:1017209   FALSE:1017209   FALSE:1017209   FALSE:1017209  
##                                                                 
##  CompetitionDistance CompetitionOpenSinceMonth CompetitionOpenSinceYear
##  Mode :logical       Mode :logical             Mode :logical           
##  FALSE:1014567       FALSE:693861              FALSE:693861            
##  TRUE :2642          TRUE :323348              TRUE :323348            
##    Promo2        Promo2SinceWeek Promo2SinceYear PromoInterval  
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:1017209   FALSE:509178    FALSE:509178    FALSE:1017209  
##                  TRUE :508031    TRUE :508031
#Check when Promo2SinceWeek&Year are simultaneously NA and the Promo2 is exactly 0.
checkPromo2 <- Train_data %>%
  filter(is.na(Train_data$Promo2SinceWeek)==TRUE & is.na(Train_data$Promo2SinceYear)==TRUE)
nrow(checkPromo2)==508031
## [1] TRUE
sum(checkPromo2$Promo2)
## [1] 0
#Check when CompetitionOpenWeek&Year are simultaneously NA
CompetitionOpen <- Train_data %>%
  filter(is.na(Train_data$CompetitionOpenSinceMonth)==TRUE & is.na(Train_data$CompetitionOpenSinceYear)==TRUE)
nrow(CompetitionOpen)==323348
## [1] TRUE

Date format

Train_data <- Train_data %>%
  mutate(date=as.Date(Train_data$Date, format="%Y-%m-%d"))%>%
  mutate(yearMonth=format(as.Date(date),"%Y-%m"))%>%
  mutate(month=month(Date))

average sales and customer without any criteria

#sales distribution
ggplot(data=Train_data,mapping = aes(x=Train_data$Sales))+
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#customer distribution
ggplot(data=Train_data,mapping = aes(x=Train_data$Customers))+
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#relationship between sales and customers
ggplot(data = Train_data,mapping = aes(x=Train_data$Customers,y=Train_data$Sales))+
  geom_point()

#store average
storeAvg <- Train_data%>%
  filter(Open==1)%>%
  group_by(Store)%>%
  summarize(mean(Sales),sd(Sales),mean(Customers),sd(Customers),mean(Sales)/mean(Customers))
summary(storeAvg)
##      Store         mean(Sales)      sd(Sales)    mean(Customers) 
##  Min.   :   1.0   Min.   : 2704   Min.   : 576   Min.   : 240.2  
##  1st Qu.: 279.5   1st Qu.: 5322   1st Qu.:1403   1st Qu.: 541.5  
##  Median : 558.0   Median : 6590   Median :1773   Median : 678.7  
##  Mean   : 558.0   Mean   : 6934   Mean   :1846   Mean   : 754.5  
##  3rd Qu.: 836.5   3rd Qu.: 7964   3rd Qu.:2154   3rd Qu.: 866.2  
##  Max.   :1115.0   Max.   :21757   Max.   :4872   Max.   :3403.5  
##  sd(Customers)    mean(Sales)/mean(Customers)
##  Min.   : 52.59   Min.   : 3.514             
##  1st Qu.:104.56   1st Qu.: 8.132             
##  Median :134.86   Median : 9.464             
##  Mean   :148.98   Mean   : 9.644             
##  3rd Qu.:174.96   3rd Qu.:10.981             
##  Max.   :654.32   Max.   :16.163
#(Year-Month) average sales, and precent change
avgSales_month <- Train_data %>%
  group_by(yearMonth)%>%
  summarize(mean(Sales))

avgCustomers_month <- Train_data %>%
  group_by(yearMonth)%>%
  summarize(mean(Customers))

Promotion v.s sales & customer: 1. Average sales and customers with promotion is more higher than no promotion, especially in December. 2. Sales per customer spend with promotion is slightly higher than without promotion. 3. Sales per customer spend among months are not much significantly difference, but still can observe that having promotion in Dec. can increase more sales and customer.

#total sales between promo and no promo
ggplot(data = Train_data, mapping=aes(x=Train_data$Promo, y=Train_data$Sales))+
  geom_bar(stat = "identity")

#total customers between promo and no promo
ggplot(data = Train_data, mapping=aes(x=Train_data$Promo, y=Train_data$Customers))+
  geom_bar(stat = "identity")

#(Year-Month) average sales&customer with promo
avgSales_month_promo <- Train_data %>%
  filter(Promo==1)%>%
  group_by(yearMonth)%>%
  summarize(mean(Sales))
avgCustomers_month_promo <- Train_data %>%
  filter(Promo==1)%>%
  group_by(yearMonth)%>%
  summarize(mean(Customers))

#(Year-Month) average sales&customer without promo
avgSales_month_withoutpromo <- Train_data %>%
  filter(Promo==0)%>%
  group_by(yearMonth)%>%
  summarize(mean(Sales))
avgCustomers_month_withoutpromo <- Train_data %>%
  filter(Promo==0)%>%
  group_by(yearMonth)%>%
  summarize(mean(Customers))

#cbind avg sale & avg customer, with promo, without promo
compareAvgSale_month <- cbind(avgSales_month, avgSales_month_promo$`mean(Sales)`, avgSales_month_withoutpromo$`mean(Sales)`)
names(compareAvgSale_month)[3]<-"avgSalesMeanPromo"
names(compareAvgSale_month)[4]<-"avgSalesMeanNoPromo"

compareAvgCustomers_month <- cbind(avgCustomers_month, avgCustomers_month_promo$`mean(Customers)`, avgCustomers_month_withoutpromo$`mean(Customers)`)
names(compareAvgCustomers_month)[3]<-"avgCusMeanPromo"
names(compareAvgCustomers_month)[4]<-"avgCusMeanNoPromo"

#compare average sales by year-month
ggplot(data = compareAvgSale_month, mapping=aes(x=compareAvgSale_month$yearMonth))+
  geom_point(mapping=aes(y=compareAvgSale_month$avgSalesMeanPromo), colour="red")+
  geom_point(mapping=aes(y=compareAvgSale_month$avgSalesMeanNoPromo), colour="blue")

#compare average customers by year-month
ggplot(data = compareAvgCustomers_month, mapping=aes(x=compareAvgCustomers_month$yearMonth))+
  geom_point(mapping=aes(y=compareAvgCustomers_month$avgCusMeanPromo), colour="red")+
  geom_point(mapping=aes(y=compareAvgCustomers_month$avgCusMeanNoPromo), colour="blue")

#(byMonth) average sales&customer with promo
avgSales_month2_promo <- Train_data %>%
  filter(Open==1)%>%
  filter(Promo==1)%>%
  group_by(month(Date))%>%
  summarize(mean(Sales))
names(avgSales_month2_promo)[2] <- "meanSales_promo"

avgCustomers_month2_promo <- Train_data %>%
  filter(Open==1)%>%
  filter(Promo==1)%>%
  group_by(month(Date))%>%
  summarize(mean(Customers))
names(avgCustomers_month2_promo)[2] <- "meanCustomers_promo"

#(byMonth) average sales & customer with no promo
avgSales_month2_noPromo <- Train_data %>%
  filter(Open==1)%>%
  filter(Promo==0)%>%
  group_by(month(Date))%>%
  summarize(mean(Sales))
names(avgSales_month2_noPromo)[2] <- "meanSales_nopromo"

avgCustomers_month2_noPromo <- Train_data %>%
  filter(Open==1)%>%
  filter(Promo==0)%>%
  group_by(month(Date))%>%
  summarize(mean(Customers))
names(avgCustomers_month2_noPromo)[2] <- "meanCustomers_nopromo"

#cbind
compareByMonth_saleCustomer <- cbind(avgSales_month2_promo,avgCustomers_month2_promo[,-1],avgSales_month2_noPromo[,-1],avgCustomers_month2_noPromo[,-1])
compareByMonth_saleCustomer <- compareByMonth_saleCustomer%>%
  mutate(per_promo=compareByMonth_saleCustomer$meanSales_promo/compareByMonth_saleCustomer$meanCustomers_promo,per_noPromo=compareByMonth_saleCustomer$meanSales_nopromo/compareByMonth_saleCustomer$meanCustomers_nopromo)

#compare sales by month
ggplot(data = compareByMonth_saleCustomer, mapping = aes(x=compareByMonth_saleCustomer$`month(Date)`))+
  geom_line(mapping = aes(y=compareByMonth_saleCustomer$meanSales_promo),colour="red")+
  geom_line(mapping = aes(y=compareByMonth_saleCustomer$meanSales_nopromo),colour="blue")+
  scale_x_continuous(breaks = 1:12)+
  xlab("Month")+
  ylab("Average sales")

#compare customer by month
ggplot(data = compareByMonth_saleCustomer, mapping = aes(x=compareByMonth_saleCustomer$`month(Date)`))+
  geom_line(mapping = aes(y=compareByMonth_saleCustomer$meanCustomers_promo),colour="red")+
  geom_line(mapping = aes(y=compareByMonth_saleCustomer$meanCustomers_nopromo),colour="blue")+
  scale_x_continuous(breaks = 1:12)+
  xlab("Month")+
  ylab("Average customers")

#compare per customer spend by month
ggplot(data = compareByMonth_saleCustomer, mapping = aes(x=compareByMonth_saleCustomer$`month(Date)`))+
  geom_line(mapping = aes(y=compareByMonth_saleCustomer$per_promo),colour="red")+
  geom_line(mapping = aes(y=compareByMonth_saleCustomer$per_noPromo),colour="blue")+
  scale_x_continuous(breaks = 1:12)+
  xlab("Month")+
  ylab("Average per customer spend")

Store types & Assortment v.s sales&customer: 1.Total sales, customers and sales per customer spend of store type B are lower than other store types, but the average sales, customers and sales per customer spend are higher than other store types, it may because the number of type B store is very few. 2. Total sales, customers and sales per customer spend of assortment B are lower than other assortments, but the average sales, customers and sales per customer spend are higher than other assortments, it may because the number of assortments B is very few. 3. Store types and assortment with promotion seems still a little higher than those without promotion. 4. The store type B with assortment C has the highest average sales and customers.

#store type v.s assortment
ggplot(data = Train_data,mapping = aes(x=Train_data$StoreType,y=Train_data$Assortment))+
  geom_point()

#StoreType: average sale, customer and sales per customer spend
avg_type <- Train_data %>%
  filter(Open==1)%>%
  group_by(StoreType)%>%
  summarize(n(),mean(Sales),sd(Sales),mean(Customers),sd(Customers),mean(Sales)/mean(Customers))
#store type: total sales: promotion
ggplot(data = Train_data, mapping = aes(fill=factor(Train_data$Promo),x=Train_data$StoreType,y=Train_data$Sales))+
  geom_bar(stat = "identity")+
  xlab("StoreType")+
  ylab("Total sales")

#store type: total customers: promotion
ggplot(data = Train_data, mapping = aes(fill=factor(Train_data$Promo),x=Train_data$StoreType,y=Train_data$Customers))+
  geom_bar(stat = "identity")+
  xlab("StoreType")+
  ylab("Total customers")

#store type: average sales
ggplot(data=avg_type, mapping = aes(x=avg_type$StoreType,y=avg_type$`mean(Sales)`))+
  geom_bar(stat = "identity")+
  xlab("StoreType")+
  ylab("Average sales")

#store type: average customers
ggplot(data=avg_type, mapping = aes(x=avg_type$StoreType,y=avg_type$`mean(Customers)`))+
  geom_bar(stat = "identity")+
  xlab("StoreType")+
  ylab("Average customers")

#store type: sales per customer spend
ggplot(data=avg_type, mapping = aes(x=avg_type$StoreType,y=avg_type$`mean(Sales)/mean(Customers)`))+
  geom_bar(stat = "identity")+
  xlab("StoreType")+
  ylab("Average sales per customer")

#Assortment: average sale, customer and sales per customer spend
avg_assortment <- Train_data %>%
  filter(Open==1)%>%
  group_by(Assortment)%>%
  summarize(n(),mean(Sales),sd(Sales),mean(Customers),sd(Customers),mean(Sales)/mean(Customers))

#Assortment: total sales: promotion
ggplot(data = Train_data, mapping = aes(fill=factor(Train_data$Promo),x=Train_data$Assortment,y=Train_data$Sales))+
  geom_bar(stat = "identity")+
  xlab("Assortment")+
  ylab("Total sales")

#Assortment: total customers: promotion
ggplot(data = Train_data, mapping = aes(fill=factor(Train_data$Promo),x=Train_data$Assortment,y=Train_data$Customers))+
  geom_bar(stat = "identity")+
  xlab("Assortment")+
  ylab("Total customers")

#Assortment: average sales
ggplot(data=avg_assortment, mapping = aes(x=avg_assortment$Assortment,y=avg_assortment$`mean(Sales)`))+
  geom_bar(stat = "identity")+
  xlab("Assortment")+
  ylab("Average sales")

#Assortment: average customers
ggplot(data=avg_assortment, mapping = aes(x=avg_assortment$Assortment,y=avg_assortment$`mean(Customers)`))+
  geom_bar(stat = "identity")+
  xlab("Assortment")+
  ylab("Average customers")

#Assortment: sales per customer spend
ggplot(data=avg_assortment, mapping = aes(x=avg_assortment$Assortment,y=avg_assortment$`mean(Sales)/mean(Customers)`))+
  geom_bar(stat = "identity")+
  xlab("Assortment")+
  ylab("Average sales per customer")

#store type * assortment
aa<- Train_data%>%
  filter(Open=="1"&StoreType=="a"&Assortment=="a")%>%
  summarize(mean(Sales),mean(Customers))%>%
  mutate(index="aa")
names(aa)[1] <-"Sales"
names(aa)[2] <-"Customers"

ab<- Train_data%>%
  filter(Open=="1"&StoreType=="a"&Assortment=="b")%>%
  summarize(mean(Sales),mean(Customers))%>%
  mutate(index="ab")
names(ab)[1] <-"Sales"
names(ab)[2] <-"Customers"

ac<-Train_data%>%
  filter(Open=="1"&StoreType=="a"&Assortment=="c")%>%
  summarize(mean(Sales),mean(Customers))%>%
  mutate(index="ac")
names(ac)[1] <-"Sales"
names(ac)[2] <-"Customers"

ba<-Train_data%>%
  filter(Open=="1"&StoreType=="b"&Assortment=="a")%>%
  summarize(mean(Sales),mean(Customers))%>%
  mutate(index="ba")
names(ba)[1] <-"Sales"
names(ba)[2] <-"Customers"

bb<-Train_data%>%
  filter(Open=="1"&StoreType=="b"&Assortment=="b")%>%
  summarize(mean(Sales),mean(Customers))%>%
  mutate(index="bb")
names(bb)[1] <-"Sales"
names(bb)[2] <-"Customers"

bc<-Train_data%>%
  filter(Open=="1"&StoreType=="b"&Assortment=="c")%>%
  summarize(mean(Sales),mean(Customers))%>%
  mutate(index="bc")
names(bc)[1] <-"Sales"
names(bc)[2] <-"Customers"

ca<-Train_data%>%
  filter(Open=="1"&StoreType=="c"&Assortment=="a")%>%
  summarize(mean(Sales),mean(Customers))%>%
  mutate(index="ca")
names(ca)[1] <-"Sales"
names(ca)[2] <-"Customers"

cb<-Train_data%>%
  filter(Open=="1"&StoreType=="c"&Assortment=="b")%>%
  summarize(mean(Sales),mean(Customers))%>%
  mutate(index="cb")
names(cb)[1] <-"Sales"
names(cb)[2] <-"Customers"

cc<-Train_data%>%
  filter(Open=="1"&StoreType=="c"&Assortment=="c")%>%
  summarize(mean(Sales),mean(Customers))%>%
  mutate(index="cc")
names(cc)[1] <-"Sales"
names(cc)[2] <-"Customers"

da<-Train_data%>%
  filter(Open=="1"&StoreType=="d"&Assortment=="a")%>%
  summarize(mean(Sales),mean(Customers))%>%
  mutate(index="da")
names(da)[1] <-"Sales"
names(da)[2] <-"Customers"

db<-Train_data%>%
  filter(Open=="1"&StoreType=="d"&Assortment=="b")%>%
  summarize(mean(Sales),mean(Customers))%>%
  mutate(index="db")
names(db)[1] <-"Sales"
names(db)[2] <-"Customers"

dc<-Train_data%>%
  filter(Open=="1"&StoreType=="d"&Assortment=="c")%>%
  summarize(mean(Sales),mean(Customers))%>%
  mutate(index="dc")
names(dc)[1] <-"Sales"
names(dc)[2] <-"Customers"

#store type * assortment: average ales
compareType.Assortment <- rbind(aa,ab,ac,ba,bb,bc,ca,cb,cc,da,db,dc)
ggplotly(
ggplot(data = compareType.Assortment, mapping = aes(x=compareType.Assortment$index,y=compareType.Assortment$Sales))+
  geom_bar(stat = "identity")+
  xlab("Store type * Assortment")+
  ylab("Average sales")
)
## Warning: Removed 3 rows containing missing values (position_stack).
#store type * assortment: average customers
compareType.Assortment <- rbind(aa,ab,ac,ba,bb,bc,ca,cb,cc,da,db,dc)
ggplotly(
ggplot(data = compareType.Assortment, mapping = aes(x=compareType.Assortment$index,y=compareType.Assortment$Customers))+
  geom_bar(stat = "identity")+
  xlab("Store type * Assortment")+
  ylab("Average customers")
)
## Warning: Removed 3 rows containing missing values (position_stack).

DayOfWeek v.s Sales& Customers: 1. The slaes and customers on Sunday are veey low because few stores are opened and storeType C not open on Sunday. 2. While Sunday has the lowest sales and customer, its average sales and customer are the highest. It may because there’s only few stores opened. Its sales per customer spend is also really high even there’s no promotion on Sunday. 3. It seems like Monday has the highest total sales and customer, and its average sales and customer are also very high. Sales per customer spend is the highest on Monday when stores have promotion. 4. Promotion definitely affect sales and customers, surprisingly Saturday and Sunday don’t have promotion.

#Day of week : total sales : store type 
ggplot(data = Train_data, mapping=aes(x=Train_data$DayOfWeek, y=Train_data$Sales))+
  geom_bar(stat = "identity", aes(color=Train_data$StoreType))+
  xlab("Day of week")+
  ylab("Average sales")

#Day of week : total customers : store type 
ggplot(data = Train_data, mapping=aes(x=Train_data$DayOfWeek, y=Train_data$Customers))+
  geom_bar(stat = "identity",aes(color=Train_data$StoreType))+
  xlab("Day of week")+
  ylab("Average customers")

#the opened stores' average sales and customers each day of week
dayofweek_open <- Train_data%>%
  filter(Open==1)%>%
  group_by(DayOfWeek)%>%
  summarize(n(),mean(Sales),mean(Customers))

ggplot(data = dayofweek_open, mapping = aes(x=dayofweek_open$DayOfWeek))+
  geom_bar(stat = "identity", mapping = aes(y=dayofweek_open$`mean(Sales)`),colour="green")+
  geom_bar(stat = "identity", mapping = aes(y=dayofweek_open$`mean(Customers)`),colour="yellow")

#opened stores with promotion or not : average sales and customers
dayofweek_open_promo <- Train_data%>%
  filter(Open==1 & Promo==1)%>%
  group_by(DayOfWeek)%>%
  summarize(n(),mean(Sales),mean(Customers))
dayofweek_open_promo[nrow(dayofweek_open_promo)+1,]=list("6",0,0,0)
dayofweek_open_promo[nrow(dayofweek_open_promo)+1,]=list("7",0,0,0)

dayofweek_open_noPromo <- Train_data%>%
  filter(Open==1 & Promo==0)%>%
  group_by(DayOfWeek)%>%
  summarize(n(),mean(Sales),mean(Customers))
#cbind
compareDayofweek_open <- cbind(dayofweek_open_promo[,-2],dayofweek_open_noPromo[,c(-1,-2)])
names(compareDayofweek_open)[c(2,3,4,5)] <- c("promo_meanSales","promo_meanCustomers","noPromo_meanSales","noPromo_meanSCustomers")
#compare average sales with or without promotion
ggplot(data = compareDayofweek_open, mapping = aes(x=compareDayofweek_open$DayOfWeek))+
  geom_point(mapping = aes(y=compareDayofweek_open$promo_meanSales),colour="red")+
  geom_point(mapping = aes(y=compareDayofweek_open$noPromo_meanSales),colour="blue")+
  xlab("Day of week")+
  ylab("Average sales")

#compare average sales per customer spend with or without promotion
ggplot(data = compareDayofweek_open, mapping = aes(x=compareDayofweek_open$DayOfWeek))+
  geom_point(mapping = aes(y=compareDayofweek_open$promo_meanSales/compareDayofweek_open$promo_meanCustomers),colour="red")+
  geom_point(mapping = aes(y=compareDayofweek_open$noPromo_meanSales/compareDayofweek_open$noPromo_meanSCustomers),colour="blue")
## Warning: Removed 2 rows containing missing values (geom_point).

stateHoliday: 1. No promotion running on any stores on Christmas (c), only store type B and D opened on Christmas and store type C not opened on Easter day 2. Store type B has the significant total sales on each holiday.

#opened stores on state holiday
stateHoliday <- Train_data %>%
  filter(Open==1)%>%
  filter(StateHoliday==c("a","b","c"))
#state holiday : total sales : promotion
ggplot(data = stateHoliday, mapping = aes(fill=factor(stateHoliday$Promo),x=stateHoliday$StateHoliday, y=stateHoliday$Sales))+
  geom_bar(stat = "identity",position = "dodge")+
  xlab("State holiday")+
  ylab("Total sales")

#state holiday : total sales : store type
ggplot(data = stateHoliday, mapping = aes(fill=factor(stateHoliday$StoreType),x=stateHoliday$StateHoliday, y=stateHoliday$Sales))+
  geom_bar(stat = "identity",position = "dodge")+
  xlab("State holiday")+
  ylab("Total sales")

#state holiday : total sales : assortment
ggplot(data = stateHoliday, mapping = aes(fill=factor(stateHoliday$Assortment),x=stateHoliday$StateHoliday, y=stateHoliday$Sales))+
  geom_bar(stat = "identity",position = "dodge")+
  xlab("State holiday")+
  ylab("Total sales")

#state holiday : total customers : promotion
ggplot(data = stateHoliday, mapping = aes(fill=factor(stateHoliday$Promo),x=stateHoliday$StateHoliday, y=stateHoliday$Customers))+
  geom_bar(stat = "identity",position = "dodge")

#state holiday : total customers : store type
ggplot(data = stateHoliday, mapping = aes(fill=factor(stateHoliday$StoreType),x=stateHoliday$StateHoliday, y=stateHoliday$Customers))+
  geom_bar(stat = "identity",position = "dodge")

#state holiday : total customers : assortment
ggplot(data = stateHoliday, mapping = aes(fill=factor(stateHoliday$Assortment),x=stateHoliday$StateHoliday, y=stateHoliday$Customers))+
  geom_bar(stat = "identity",position = "dodge")

schoolHoliday: 1. It seems like there is not a specific store type or assortment belonged to school. 2. Also, the promotion running or not do not affected by whether it is a school store or not.

Train_data%>%
  filter(Open==1&Promo==1)%>%
  group_by(SchoolHoliday)%>%
  summarize(n(),mean(Sales),mean(Customers),mean(Sales)/mean(Customers))
## # A tibble: 2 x 5
##   SchoolHoliday  `n()` `mean(Sales)` `mean(Customers)` `mean(Sales)/mean(~
##           <int>  <int>         <dbl>             <dbl>               <dbl>
## 1             0 299133         8199.              843.                9.72
## 2             1  77763         8341.              849.                9.82
Train_data%>%
  filter(Open==1&Promo==0)%>%
  group_by(SchoolHoliday)%>%
  summarize(n(),mean(Sales),mean(Customers),mean(Sales)/mean(Customers))
## # A tibble: 2 x 5
##   SchoolHoliday  `n()` `mean(Sales)` `mean(Customers)` `mean(Sales)/mean(~
##           <int>  <int>         <dbl>             <dbl>               <dbl>
## 1             0 381802         5876.              691.                8.50
## 2             1  85694         6165.              723.                8.53
#Store type and assortment affected by school holiday
Train_data%>%
  filter(SchoolHoliday==1)%>%
  distinct(Assortment)
##   Assortment
## 1          a
## 2          c
## 3          b
Train_data%>%
  filter(SchoolHoliday==1)%>%
  distinct(StoreType)
##   StoreType
## 1         c
## 2         a
## 3         d
## 4         b
#school holiday : total sales : promotion
ggplot(data = Train_data, mapping = aes(fill=factor(Train_data$Promo),x=Train_data$SchoolHoliday, y=Train_data$Sales))+
  geom_bar(stat = "identity",position = "dodge")+
  xlab("School holiday")+
  ylab("Total sales")

Promo2: Stores which keep participating promotion earn less average sales and customers than those not participate continuing promotion, but its sales per customer spend in store is higher.

#promo2 : total sales 
ggplot(data = Train_data, mapping=aes(x=Train_data$Promo2, y=Train_data$Sales))+
  geom_bar(stat = "identity")

#promo2 : total customers 
ggplot(data = Train_data, mapping=aes(x=Train_data$Promo2, y=Train_data$Customers))+
  geom_bar(stat = "identity")

#(byMonth) opened stores' average sales and customer with promo2
avgSales_month_promo2 <- Train_data %>%
  filter(Open==1)%>%
  filter(Promo2==1)%>%
  group_by(month(Date))%>%
  summarize(mean(Sales))
names(avgSales_month_promo2)[2] <- "meanSales_promo2"

avgCustomers_month_promo2 <- Train_data %>%
  filter(Open==1)%>%
  filter(Promo2==1)%>%
  group_by(month(Date))%>%
  summarize(mean(Customers))
names(avgCustomers_month_promo2)[2] <- "meanCustomers_promo2"

#(byMonth) average sales and customer with no promo2
avgSales_month_noPromo2 <- Train_data %>%
  filter(Open==1)%>%
  filter(Promo2==0)%>%
  group_by(month(Date))%>%
  summarize(mean(Sales))
names(avgSales_month_noPromo2)[2] <- "meanSales_nopromo2"

avgCustomers_month_noPromo2 <- Train_data %>%
  filter(Open==1)%>%
  filter(Promo2==0)%>%
  group_by(month(Date))%>%
  summarize(mean(Customers))
names(avgCustomers_month_noPromo2)[2] <- "meanCustomers_nopromo2"
#cbind
compareByMonth_saleCustomer2 <- cbind(avgSales_month_promo2,avgCustomers_month_promo2[,-1],avgSales_month_noPromo2[,-1],avgCustomers_month_noPromo2[,-1])
compareByMonth_saleCustomer2 <- compareByMonth_saleCustomer2%>%
  mutate(per_promo2=compareByMonth_saleCustomer2$meanSales_promo2/compareByMonth_saleCustomer2$meanCustomers_promo2,per_noPromo2=compareByMonth_saleCustomer2$meanSales_nopromo2/compareByMonth_saleCustomer2$meanCustomers_nopromo2)

#compare average sales by month with promo2
ggplot(data = compareByMonth_saleCustomer2, mapping = aes(x=compareByMonth_saleCustomer2$`month(Date)`))+
  geom_line(mapping = aes(y=compareByMonth_saleCustomer2$meanSales_promo2),colour="red")+
  geom_line(mapping = aes(y=compareByMonth_saleCustomer2$meanSales_nopromo2),colour="blue")+
  scale_x_continuous(breaks = 1:12)+
  xlab("Month")+
  ylab("Average sales")

#compare average customers by month
ggplot(data = compareByMonth_saleCustomer2, mapping = aes(x=compareByMonth_saleCustomer2$`month(Date)`))+
  geom_line(mapping = aes(y=compareByMonth_saleCustomer2$meanCustomers_promo2),colour="red")+
  geom_line(mapping = aes(y=compareByMonth_saleCustomer2$meanCustomers_nopromo2),colour="blue")+
  scale_x_continuous(breaks = 1:12)+
  xlab("Month")+
  ylab("Average customers")

#compare per customer spend by month
ggplot(data = compareByMonth_saleCustomer2, mapping = aes(x=compareByMonth_saleCustomer2$`month(Date)`))+
  geom_line(mapping = aes(y=compareByMonth_saleCustomer2$per_promo2),colour="red")+
  geom_line(mapping = aes(y=compareByMonth_saleCustomer2$per_noPromo2),colour="blue")+
  scale_x_continuous(breaks = 1:12)+
  xlab("Month")+
  ylab("Average per customer spend")

promotion vs continuing promotion: The stores’ average sales with promotion as well as continuing promotion are lower than those do not participate continuing promotion. It may because the longer period drag down the average. But the average per customer spend with promotion as well as continuing promotion is still higher than others.

#the average sales and customers with all conbinations of promo and promo2
Promo11 <- Train_data%>%
  filter(Open==1 & Promo==1 & Promo2==1)%>%
  group_by(month)%>%
  summarize(mean(Sales),mean(Customers))
names(Promo11)[2] <- "promo11meanSales"
names(Promo11)[3] <- "promo11meanCustomers"

Promo01 <- Train_data%>%
  filter(Open==1 & Promo==0 & Promo2==1)%>%
  group_by(month)%>%
  summarize(mean(Sales),mean(Customers))
names(Promo01)[2] <- "promo01meanSales"
names(Promo01)[3] <- "promo01meanCustomers"

Promo10 <- Train_data%>%
  filter(Open==1 & Promo==1 & Promo2==0)%>%
  group_by(month)%>%
  summarize(mean(Sales),mean(Customers))
names(Promo10)[2] <- "promo10meanSales"
names(Promo10)[3] <- "promo10meanCustomers"

Promo00 <- Train_data%>%
  filter(Open==1 & Promo==0 & Promo2==0)%>%
  group_by(month)%>%
  summarize(mean(Sales),mean(Customers))
names(Promo00)[2] <- "promo00meanSales"
names(Promo00)[3] <- "promo00meanCustomers"
#cbind
comparePromo <- cbind(Promo00,Promo01[,-1],Promo10[,-1],Promo11[,-1])
#average sales 
ggplot(data = comparePromo, mapping = aes(x=comparePromo$month))+
  geom_line(mapping = aes(y=comparePromo$promo00meanSales))+
  geom_line(mapping = aes(y=comparePromo$promo01meanSales),colour="orange")+
  geom_line(mapping = aes(y=comparePromo$promo10meanSales),colour="blue")+
  geom_line(mapping = aes(y=comparePromo$promo11meanSales),colour="red")+
  scale_x_continuous(breaks = 1:12)+
  xlab("Month")+
  ylab("average sales")

#average sales per customer spend
ggplot(data = comparePromo, mapping = aes(x=comparePromo$month))+
  geom_line(mapping = aes(y=comparePromo$promo00meanSales/comparePromo$promo00meanCustomers))+
  geom_line(mapping = aes(y=comparePromo$promo01meanSales/comparePromo$promo01meanCustomers),colour="orange")+
  geom_line(mapping = aes(y=comparePromo$promo10meanSales/comparePromo$promo10meanCustomers),colour="blue")+
  geom_line(mapping = aes(y=comparePromo$promo11meanSales/comparePromo$promo11meanCustomers),colour="red")+
  scale_x_continuous(breaks = 1:12)+
  xlab("Month")+
  ylab("sales per customer spend")

predict sales Assume customer numbers are average of that month (using history data)

Train_data <- Train_data%>%
  mutate(dayOfWeekDummy=ifelse(DayOfWeek==7,0,1))
Train_train <- Train_data%>%
  filter(Open=="1")

rs_test <- sample(nrow(Train_train),nrow(Train_train)-800000,replace = FALSE)
Train_test <- Train_train[rs_test,]
Train_train2 <- Train_train[-(rs_test),]
  
#test data
Test_data <- left_join(Test, Store, by="Store")
Test_data <- Test_data %>%
  mutate(date=as.Date(Test_data$Date, format="%Y-%m-%d"))%>%
  mutate(yearMonth=format(as.Date(date),"%Y-%m"))%>%
  mutate(month=month(Date))
Test_data <- Test_data%>%
  mutate(dayOfWeekDummy=ifelse(DayOfWeek==7,0,1))
Test_data <- Test_data%>%
  filter(Open=="1")
#get average customers in August and September
Customers89 <- Train_data%>%
  filter(month==c(8,9))%>%
  group_by(Store)%>%
  summarize(mean(Customers))
## Warning in month == c(8, 9): longer object length is not a multiple of
## shorter object length
names(Customers89)[2]<-"Customers"
Test_data <- left_join(Test_data, Customers89, by="Store")

############################################################################
#model
lm_Sales1 <- lm(Sales~dayOfWeekDummy+as.factor(month)+Promo+StateHoliday+SchoolHoliday+StoreType+Assortment+Promo2+CompetitionDistance, Train_train)
summary(lm_Sales1)
## 
## Call:
## lm(formula = Sales ~ dayOfWeekDummy + as.factor(month) + Promo + 
##     StateHoliday + SchoolHoliday + StoreType + Assortment + Promo2 + 
##     CompetitionDistance, data = Train_train)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -10519  -1771   -409   1264  35365 
## 
## Coefficients:
##                       Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)          5.892e+03  4.936e+01  119.361  < 2e-16 ***
## dayOfWeekDummy      -2.477e+02  4.816e+01   -5.144 2.69e-07 ***
## as.factor(month)2    1.038e+02  1.346e+01    7.708 1.28e-14 ***
## as.factor(month)3    3.428e+02  1.323e+01   25.917  < 2e-16 ***
## as.factor(month)4    3.999e+02  1.343e+01   29.768  < 2e-16 ***
## as.factor(month)5    5.338e+02  1.347e+01   39.624  < 2e-16 ***
## as.factor(month)6    4.868e+02  1.337e+01   36.417  < 2e-16 ***
## as.factor(month)7    1.870e+02  1.351e+01   13.848  < 2e-16 ***
## as.factor(month)8   -7.524e+01  1.560e+01   -4.824 1.41e-06 ***
## as.factor(month)9   -3.299e+00  1.521e+01   -0.217   0.8283    
## as.factor(month)10  -6.857e+00  1.521e+01   -0.451   0.6522    
## as.factor(month)11   5.174e+02  1.533e+01   33.763  < 2e-16 ***
## as.factor(month)12   2.057e+03  1.542e+01  133.443  < 2e-16 ***
## Promo                2.317e+03  6.029e+00  384.300  < 2e-16 ***
## StateHolidaya        2.449e+01  1.048e+02    0.234   0.8153    
## StateHolidayb        5.764e+02  2.284e+02    2.524   0.0116 *  
## StateHolidayc       -1.337e+03  3.263e+02   -4.098 4.16e-05 ***
## SchoolHoliday        2.994e+02  8.357e+00   35.830  < 2e-16 ***
## StoreTypeb           5.039e+03  3.322e+01  151.697  < 2e-16 ***
## StoreTypec          -1.124e+02  9.164e+00  -12.265  < 2e-16 ***
## StoreTyped          -2.083e+02  7.013e+00  -29.696  < 2e-16 ***
## Assortmentb         -3.022e+03  4.406e+01  -68.588  < 2e-16 ***
## Assortmentc          8.533e+02  6.261e+00  136.285  < 2e-16 ***
## Promo2              -7.581e+02  6.093e+00 -124.415  < 2e-16 ***
## CompetitionDistance -2.376e-02  3.968e-04  -59.883  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2741 on 842181 degrees of freedom
##   (2186 observations deleted due to missingness)
## Multiple R-squared:  0.2207, Adjusted R-squared:  0.2206 
## F-statistic:  9936 on 24 and 842181 DF,  p-value: < 2.2e-16
lm_Sales2 <- lm(Sales~DayOfWeek+Promo+SchoolHoliday+StoreType+Assortment+Promo2+CompetitionDistance, Train_train2)
summary(lm_Sales2)
## 
## Call:
## lm(formula = Sales ~ DayOfWeek + Promo + SchoolHoliday + StoreType + 
##     Assortment + Promo2 + CompetitionDistance, data = Train_train2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -10309  -1818   -449   1263  35031 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          6.616e+03  1.035e+01  639.04   <2e-16 ***
## DayOfWeek           -1.480e+02  1.899e+00  -77.93   <2e-16 ***
## Promo                2.164e+03  6.519e+00  332.01   <2e-16 ***
## SchoolHoliday        1.264e+02  7.929e+00   15.94   <2e-16 ***
## StoreTypeb           5.148e+03  3.365e+01  153.01   <2e-16 ***
## StoreTypec          -1.117e+02  9.522e+00  -11.73   <2e-16 ***
## StoreTyped          -2.113e+02  7.283e+00  -29.02   <2e-16 ***
## Assortmentb         -3.022e+03  4.575e+01  -66.06   <2e-16 ***
## Assortmentc          8.573e+02  6.502e+00  131.86   <2e-16 ***
## Promo2              -7.656e+02  6.327e+00 -121.01   <2e-16 ***
## CompetitionDistance -2.349e-02  4.110e-04  -57.16   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2771 on 797920 degrees of freedom
##   (2069 observations deleted due to missingness)
## Multiple R-squared:  0.2033, Adjusted R-squared:  0.2033 
## F-statistic: 2.036e+04 on 10 and 797920 DF,  p-value: < 2.2e-16
#assuming the number of customers is the average of the previous years
lm_Sales3 <- lm(Sales~Customers+Promo+StoreType, Train_train2)
summary(lm_Sales3)
## 
## Call:
## lm(formula = Sales ~ Customers + Promo + StoreType, data = Train_train2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -28443.1   -814.4   -135.4    660.9  28414.4 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.937e+02  4.084e+00  145.37   <2e-16 ***
## Customers    7.288e+00  4.456e-03 1635.43   <2e-16 ***
## Promo        1.194e+03  3.149e+00  379.33   <2e-16 ***
## StoreTypeb  -5.555e+03  1.270e+01 -437.45   <2e-16 ***
## StoreTypec  -1.412e+02  4.666e+00  -30.26   <2e-16 ***
## StoreTyped   1.275e+03  3.554e+00  358.85   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1366 on 799994 degrees of freedom
## Multiple R-squared:  0.8063, Adjusted R-squared:  0.8063 
## F-statistic: 6.661e+05 on 5 and 799994 DF,  p-value: < 2.2e-16
ggplot(lm_Sales3, aes(x=.resid))+geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(lm_Sales3,aes(sample=.stdresid))+stat_qq()+geom_abline()

#best
lm_Sales3.1 <- lm(Sales~Customers*StoreType+Promo, Train_train2)
summary(lm_Sales3.1)
## 
## Call:
## lm(formula = Sales ~ Customers * StoreType + Promo, data = Train_train2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -28513.5   -771.4    -89.8    662.4  28353.0 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           6.226e+02  4.528e+00  137.50   <2e-16 ***
## Customers             7.307e+00  5.149e-03 1419.14   <2e-16 ***
## StoreTypeb           -6.487e+02  2.631e+01  -24.66   <2e-16 ***
## StoreTypec           -3.063e+02  1.145e+01  -26.74   <2e-16 ***
## StoreTyped           -5.049e+02  9.506e+00  -53.11   <2e-16 ***
## Promo                 1.096e+03  2.998e+00  365.71   <2e-16 ***
## Customers:StoreTypeb -2.441e+00  1.276e-02 -191.29   <2e-16 ***
## Customers:StoreTypec  2.022e-01  1.301e-02   15.54   <2e-16 ***
## Customers:StoreTyped  2.942e+00  1.410e-02  208.61   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1293 on 799991 degrees of freedom
## Multiple R-squared:  0.8265, Adjusted R-squared:  0.8265 
## F-statistic: 4.764e+05 on 8 and 799991 DF,  p-value: < 2.2e-16
ggplot(lm_Sales3.1, aes(x=.resid))+geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(lm_Sales3.1,aes(sample=.stdresid))+stat_qq()+geom_abline()

lm_Sales4 <- lm(Sales~Customers+Promo+CompetitionDistance, Train_train2)
summary(lm_Sales4)
## 
## Call:
## lm(formula = Sales ~ Customers + Promo + CompetitionDistance, 
##     data = Train_train2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -21087.3   -912.7   -148.1    790.8  29449.8 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         1.468e+03  4.352e+00   337.3   <2e-16 ***
## Customers           6.147e+00  4.598e-03  1336.8   <2e-16 ***
## Promo               1.393e+03  3.672e+00   379.3   <2e-16 ***
## CompetitionDistance 3.258e-02  2.323e-04   140.3   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1602 on 797927 degrees of freedom
##   (2069 observations deleted due to missingness)
## Multiple R-squared:  0.7336, Adjusted R-squared:  0.7336 
## F-statistic: 7.324e+05 on 3 and 797927 DF,  p-value: < 2.2e-16
ggplot(lm_Sales4, aes(x=.resid))+geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

lm_Sales5 <- lm(Sales~Customers+Promo+Assortment, Train_train2)
summary(lm_Sales5)
## 
## Call:
## lm(formula = Sales ~ Customers + Promo + Assortment, data = Train_train2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -24095.4   -896.2   -152.7    744.9  28421.7 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.088e+03  3.992e+00   272.5   <2e-16 ***
## Customers    6.618e+00  4.361e-03  1517.7   <2e-16 ***
## Promo        1.304e+03  3.332e+00   391.3   <2e-16 ***
## Assortmentb -6.618e+03  1.759e+01  -376.3   <2e-16 ***
## Assortmentc  6.521e+02  3.270e+00   199.4   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1452 on 799995 degrees of freedom
## Multiple R-squared:  0.7811, Adjusted R-squared:  0.7811 
## F-statistic: 7.138e+05 on 4 and 799995 DF,  p-value: < 2.2e-16
ggplot(lm_Sales5, aes(x=.resid))+geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

lm_Sales6 <- lm(Sales~Customers+Promo+Promo2, Train_train2)
summary(lm_Sales6)
## 
## Call:
## lm(formula = Sales ~ Customers + Promo + Promo2, data = Train_train2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -20891.3   -941.9   -155.1    802.6  29268.6 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.575e+03  4.737e+00  332.51   <2e-16 ***
## Customers   6.106e+00  4.686e-03 1303.05   <2e-16 ***
## Promo       1.397e+03  3.704e+00  377.22   <2e-16 ***
## Promo2      1.992e+02  3.697e+00   53.88   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1618 on 799996 degrees of freedom
## Multiple R-squared:  0.7284, Adjusted R-squared:  0.7284 
## F-statistic: 7.152e+05 on 3 and 799996 DF,  p-value: < 2.2e-16
ggplot(lm_Sales6, aes(x=.resid))+geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

lm_Sales7 <- lm(Sales~Customers+Promo+SchoolHoliday, Train_train2)
summary(lm_Sales7)
## 
## Call:
## lm(formula = Sales ~ Customers + Promo + SchoolHoliday, data = Train_train2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -20617.4   -948.1   -156.7    807.2  29441.7 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1.694e+03  4.095e+00   413.7   <2e-16 ***
## Customers     6.052e+00  4.593e-03  1317.6   <2e-16 ***
## Promo         1.403e+03  3.708e+00   378.4   <2e-16 ***
## SchoolHoliday 9.816e+01  4.587e+00    21.4   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1620 on 799996 degrees of freedom
## Multiple R-squared:  0.7276, Adjusted R-squared:  0.7276 
## F-statistic: 7.122e+05 on 3 and 799996 DF,  p-value: < 2.2e-16
ggplot(lm_Sales7, aes(x=.resid))+geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#prediction using model 3.1
salesPred1 <- predict(lm_Sales1,newdata = Test_data,interval = "prediction")
summary(salesPred1)
##       fit             lwr             upr       
##  Min.   : 4014   Min.   :-1358   Min.   : 9386  
##  1st Qu.: 5519   1st Qu.:  147   1st Qu.:10890  
##  Median : 6474   Median : 1102   Median :11845  
##  Mean   : 6645   Mean   : 1273   Mean   :12016  
##  3rd Qu.: 7880   3rd Qu.: 2509   3rd Qu.:13252  
##  Max.   :14121   Max.   : 8749   Max.   :19493  
##  NA's   :65      NA's   :65      NA's   :65
plot(salesPred1)

Train_test$predSales3.1 <- predict(lm_Sales3.1, newdata= Train_test)

Test_data$predSales <- predict(lm_Sales3.1, newdata= Test_data)

ggplot(data = Test_data, mapping = aes(x=Test_data$Store,y=Test_data$predSales))+
  geom_bar(stat = "identity")